{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
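    "# Concatenate all generated features into RS_train.csv (training set)\n",
    "# and RS_test.csv (test set),\n",
    "# in preparation for the final recommender-system stage.\n",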
    "from __future__ import division\n",
    "\n",
    "import cPickle\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "from numpy.random import random  \n",
    "from collections import defaultdict\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "class RecommonderSystem:\n",
    "  def __init__(self):\n",
    "    \"\"\"Load every precomputed artifact from disk and train the SVD model.\n",
    "\n",
    "    All files are opened by relative name, i.e. they are looked up in the\n",
    "    current working directory. After the index tables and the user-event\n",
    "    matrix are loaded, init_SVD() and train_SVD(\"train.csv\") are called.\n",
    "    \"\"\"\n",
    "\n",
    "    def load_pickle(path):\n",
    "      # The context manager closes the handle even if unpickling fails.\n",
    "      # NOTE(review): unpickling can execute arbitrary code -- only load\n",
    "      # .pkl files produced by a trusted pipeline.\n",
    "      with open(path, 'rb') as fh:\n",
    "        return cPickle.load(fh)\n",
    "\n",
    "    # New integer indices for users and events\n",
    "    self.userIndex = load_pickle(\"PE_userIndex.pkl\")\n",
    "    self.eventIndex = load_pickle(\"PE_eventIndex.pkl\")\n",
    "    self.n_users = len(self.userIndex)\n",
    "    self.n_items = len(self.eventIndex)\n",
    "\n",
    "    # User-event relation matrix R.\n",
    "    # train_SVD reads the same information again from the training file\n",
    "    # because the two consumers expect different formats.\n",
    "    self.userEventScores = sio.mmread(\"PE_userEventScores\").todense()\n",
    "    # Mean score of every user; used by the item-based CF similarity to\n",
    "    # remove the effect of each user's rating habits.\n",
    "    self.userMeanScore = np.mean(self.userEventScores, axis = 1)\n",
    "\n",
    "    # Inverted tables\n",
    "    ## events attended by each user\n",
    "    self.itemsForUser = load_pickle(\"PE_eventsForUser.pkl\")\n",
    "    ## users attending each event\n",
    "    self.usersForItem = load_pickle(\"PE_usersForEvent.pkl\")\n",
    "\n",
    "    # Model-based collaborative filtering: initialise parameters, then train\n",
    "    self.init_SVD()\n",
    "    self.train_SVD(trainfile = \"train.csv\")\n",
    "\n",
    "    # User-user similarity computed from user attributes\n",
    "    self.userSimMatrix = sio.mmread(\"US_userSimMatrix\").todense()\n",
    "\n",
    "    # Event-event similarities computed from event attributes\n",
    "    self.eventPropSim = sio.mmread(\"EV_eventPropSim\").todense()\n",
    "    self.eventContSim = sio.mmread(\"EV_eventContSim\").todense()\n",
    "\n",
    "    # Number of friends of every user\n",
    "    self.numFriends = sio.mmread(\"UF_numFriends\")\n",
    "    # Influence on a user of the event scores of each of their friends\n",
    "    self.userFriends = sio.mmread(\"UF_userFriends\").todense()\n",
    "\n",
    "    # Popularity of each event\n",
    "    self.eventPopularity = sio.mmread(\"EA_eventPopularity\").todense()\n",
    "\n",
    "  def init_SVD(self, K=20):\n",
    "    \"\"\"Initialise the parameters of the model-based CF (SVD_CF).\n",
    "\n",
    "    K is the number of latent factors. Biases start at zero; the factor\n",
    "    matrices P (n_users x K) and Q (K x n_items) are drawn with\n",
    "    numpy.random.random and scaled by sqrt(K)/10.\n",
    "    \"\"\"\n",
    "    self.K = K\n",
    "\n",
    "    # per-item and per-user bias terms\n",
    "    self.bi = np.zeros(self.n_items)\n",
    "    self.bu = np.zeros(self.n_users)\n",
    "\n",
    "    # latent-factor matrices; P is drawn before Q\n",
    "    factor_scale = np.sqrt(self.K)\n",
    "    self.P = random((self.n_users, self.K)) / 10 * factor_scale\n",
    "    self.Q = random((self.K, self.n_items)) / 10 * factor_scale\n",
    "                  \n",
    "          \n",
    "  def train_SVD(self,trainfile = 'train.csv', steps=100,gamma=0.04,Lambda=0.15):\n",
    "    \"\"\"Train the biased matrix-factorisation model (SVD_CF) with SGD.\n",
    "\n",
    "    Parameters:\n",
    "      trainfile: CSV file with a header row; column 0 is the user id,\n",
    "                 column 1 the event id and column 4 the integer\n",
    "                 \"interested\" label that is used as the rating.\n",
    "      steps:     number of passes over the training records.\n",
    "      gamma:     initial learning rate; multiplied by 0.93 after each pass.\n",
    "      Lambda:    L2 regularisation strength.\n",
    "\n",
    "    Side effects: sets self.mu (mean rating) and updates self.bu, self.bi,\n",
    "    self.P and self.Q in place, so init_SVD must have been called first.\n",
    "\n",
    "    Raises:\n",
    "      ValueError: if the file contains no data rows.\n",
    "    \"\"\"\n",
    "    print(\"SVD Train...\")\n",
    "    uids = []   # user index of every record\n",
    "    i_ids = []  # item index of every record\n",
    "    # Temporary user-item rating matrix; only needed while training\n",
    "    R = np.zeros((self.n_users, self.n_items))\n",
    "    rating_total = 0.0\n",
    "\n",
    "    # The context manager closes the file even if a row fails to parse\n",
    "    with open(trainfile, 'r') as ftrain:\n",
    "        ftrain.readline()  # skip the header row\n",
    "        for line in ftrain:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                continue  # tolerate blank lines such as a trailing newline\n",
    "            cols = line.split(\",\")\n",
    "            u = self.userIndex[cols[0]]  # user\n",
    "            i = self.eventIndex[cols[1]] # event\n",
    "\n",
    "            uids.append(u)\n",
    "            i_ids.append(i)\n",
    "\n",
    "            R[u,i] = int(cols[4])  # interested\n",
    "            rating_total += R[u,i]\n",
    "\n",
    "    n_records = len(uids)\n",
    "    if n_records == 0:\n",
    "        raise ValueError(\"no training records found in %s\" % trainfile)\n",
    "    self.mu = rating_total / n_records\n",
    "\n",
    "    for step in range(steps):\n",
    "        # visit the training samples in a fresh random order on every pass\n",
    "        for index in np.random.permutation(n_records):\n",
    "            u = uids[index]\n",
    "            i = i_ids[index]\n",
    "\n",
    "            # prediction residual\n",
    "            eui = R[u,i] - self.pred_SVD(u,i)\n",
    "\n",
    "            # SGD step on the biases\n",
    "            self.bu[u] += gamma*(eui - Lambda*self.bu[u])\n",
    "            self.bi[i] += gamma*(eui - Lambda*self.bi[i])\n",
    "\n",
    "            # SGD step on the latent factors. Both updates use the values\n",
    "            # from before this step (hence the copies), and the\n",
    "            # regularisation term is scaled by the learning rate exactly as\n",
    "            # it is for the biases above.\n",
    "            p_u = self.P[u,:].copy()\n",
    "            q_i = self.Q[:,i].copy()\n",
    "            self.P[u,:] = p_u + gamma*(eui*q_i - Lambda*p_u)\n",
    "            self.Q[:,i] = q_i + gamma*(eui*p_u - Lambda*q_i)\n",
    "\n",
    "        # decay the learning rate\n",
    "        gamma = gamma*0.93\n",
    "    print(\"SVD trained\")\n",
    "    \n",
    "  def pred_SVD(self, uid, i_id):\n",
    "    \"\"\"Predict the score of user index uid for item index i_id.\n",
    "\n",
    "    The raw prediction is mu + item bias + user bias + P[uid] . Q[:, i_id];\n",
    "    values outside [0, 1] are replaced by the nearest bound.\n",
    "    \"\"\"\n",
    "    raw_score = self.mu + self.bi[i_id] + self.bu[uid] + np.dot(self.P[uid,:],self.Q[:,i_id])\n",
    "\n",
    "    # keep the score inside the 0-1 range\n",
    "    if raw_score < 0:\n",
    "        return 0\n",
    "    if raw_score > 1:\n",
    "        return 1\n",
    "    return raw_score\n",
    "\n",
    "  def sim_cal_UserCF(self, uid1, uid2 ):\n",
    "    \"\"\"Pearson correlation of two users over the items both of them scored.\n",
    "\n",
    "    uid1 and uid2 are user indices. Returns 0 when the users share no item\n",
    "    or when the denominator of the correlation is 0; otherwise returns\n",
    "    numerator / denominator.\n",
    "    \"\"\"\n",
    "    # items scored by uid1 that uid2 scored as well (dict used as a set)\n",
    "    shared = {item: 1 for item in self.itemsForUser[uid1]\n",
    "              if item in self.itemsForUser[uid2]}\n",
    "\n",
    "    count = len(shared)\n",
    "    if count == 0:\n",
    "        # nothing in common: similarity is defined as 0\n",
    "        return 0\n",
    "\n",
    "    # scores each user gave to the shared items\n",
    "    scores_a = np.array([self.userEventScores[uid1,item] for item in shared])\n",
    "    scores_b = np.array([self.userEventScores[uid2,item] for item in shared])\n",
    "\n",
    "    total_a = np.sum(scores_a)\n",
    "    total_b = np.sum(scores_b)\n",
    "    squares_a = np.sum(scores_a**2)\n",
    "    squares_b = np.sum(scores_b**2)\n",
    "    cross = np.sum(scores_a*scores_b)\n",
    "\n",
    "    numerator = cross-(total_a*total_b/count)\n",
    "    denominator = np.sqrt((squares_a-total_a**2/count)*(squares_b-total_b**2/count))\n",
    "    if denominator == 0:\n",
    "        # this pair carries no usable signal, discard it\n",
    "        return 0\n",
    "\n",
    "    return numerator/denominator\n",
    "\n",
    "  def userCFReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    Recommendation score of an event from user-based collaborative filtering.\n",
    "    The idea in pseudo-code:\n",
    "    for item i\n",
    "      for every other user v that has a preference for i\n",
    "        compute similarity s between u and v\n",
    "        incorporate v's preference for i weighted by s into running average\n",
    "    return top items ranked by weighted average\n",
    "\n",
    "    userId and eventId are the raw ids (keys of userIndex / eventIndex).\n",
    "    Returns the similarity-weighted mean score clipped to [0, 1], or\n",
    "    self.mu when the accumulated similarity is 0.\n",
    "    \"\"\"\n",
    "\n",
    "    u = self.userIndex[userId]\n",
    "    i = self.eventIndex[eventId]\n",
    "\n",
    "    sim_accumulate=0.0\n",
    "    rat_acc=0.0\n",
    "\n",
    "    for user in self.usersForItem[i]:  # every user that scored eventId\n",
    "        if user == u:\n",
    "            # Only *other* users may vote: including the target user would\n",
    "            # feed their own score for this event back into the feature.\n",
    "            continue\n",
    "\n",
    "        sim = self.sim_cal_UserCF(uid1 = user,uid2 = u)    # similarity between user and u\n",
    "        if sim == 0 or np.isnan(sim):\n",
    "            # no signal, or an undefined similarity that would turn the\n",
    "            # whole average into NaN\n",
    "            continue\n",
    "\n",
    "        rat_acc += sim * self.userEventScores[user,i]   # score of user for eventId\n",
    "        sim_accumulate += sim\n",
    "\n",
    "    if sim_accumulate==0: # no usable neighbour: fall back to the global mean score\n",
    "        return  self.mu\n",
    "    ans = rat_acc/sim_accumulate\n",
    "\n",
    "    # keep the score inside the 0-1 range\n",
    "    if ans>1:\n",
    "        return 1\n",
    "    elif ans<0:\n",
    "        return 0\n",
    "    return ans\n",
    "\n",
    "\n",
    "  def sim_cal_ItemCF(self, i_id1, i_id2):\n",
    "    \"\"\"Adjusted-cosine similarity between items i_id1 and i_id2.\n",
    "\n",
    "    Only users that scored both items are considered; each user's mean\n",
    "    score is subtracted before the cosine is taken. Returns 0 when no\n",
    "    user scored both items or when either centred vector is all zeros\n",
    "    (the cosine is undefined there and would otherwise come back as NaN).\n",
    "    \"\"\"\n",
    "    # users that scored both items (dict used as a set)\n",
    "    si = {user: 1 for user in self.usersForItem[i_id1]\n",
    "          if user in self.usersForItem[i_id2]}\n",
    "\n",
    "    if len(si) == 0:  # no user in common\n",
    "        return 0\n",
    "\n",
    "    # scores of the common users for each item\n",
    "    s1=np.array([self.userEventScores[u, i_id1] for u in si])\n",
    "    s2=np.array([self.userEventScores[u, i_id2] for u in si])\n",
    "\n",
    "    # centre the scores on every user's mean score\n",
    "    user_mean_score = np.array([self.userMeanScore[u,0] for u in si])\n",
    "    s1 = s1 - user_mean_score\n",
    "    s2 = s2 - user_mean_score\n",
    "\n",
    "    # A zero vector has no direction: ssd.cosine would divide by a zero norm\n",
    "    if not np.any(s1) or not np.any(s2):\n",
    "        return 0\n",
    "\n",
    "    # cosine similarity\n",
    "    sim = 1- ssd.cosine(s1, s2)\n",
    "    return sim\n",
    "            \n",
    "  def eventCFReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    Recommendation score of an event from item-based collaborative filtering.\n",
    "    The idea in pseudo-code:\n",
    "    for item i\n",
    "        for every item j that u has a preference for\n",
    "            compute similarity s between i and j\n",
    "            add u's preference for j weighted by s to a running average\n",
    "    return top items, ranked by weighted average\n",
    "\n",
    "    userId and eventId are the raw ids (keys of userIndex / eventIndex).\n",
    "    Returns the similarity-weighted mean score clipped to [0, 1], or\n",
    "    self.mu when the accumulated similarity is 0.\n",
    "    \"\"\"\n",
    "    u = self.userIndex[userId]\n",
    "    i = self.eventIndex[eventId]\n",
    "\n",
    "    sim_accumulate=0.0\n",
    "    rat_acc=0.0\n",
    "\n",
    "    # NOTE(review): if eventId itself is among the user's items it takes part\n",
    "    # in the average with its own score -- confirm that this is intended.\n",
    "    for item in self.itemsForUser[u]:  # every item the user scored\n",
    "        sim = self.sim_cal_ItemCF(item,i)    # similarity between item and i\n",
    "        if np.isnan(sim):\n",
    "            # an undefined similarity would turn the whole average into NaN\n",
    "            continue\n",
    "\n",
    "        rat_acc += sim * self.userEventScores[u,item]\n",
    "        sim_accumulate += sim\n",
    "\n",
    "    if sim_accumulate==0: # no usable neighbour: fall back to the global mean score\n",
    "        return  self.mu\n",
    "\n",
    "    ans = rat_acc/sim_accumulate\n",
    "\n",
    "    # keep the score inside the 0-1 range\n",
    "    if ans>1:\n",
    "        return 1\n",
    "    elif ans<0:\n",
    "        return 0\n",
    "    return ans\n",
    "    \n",
    "  def svdCFReco(self, userId, eventId):\n",
    "    \"\"\"Model-based CF score: map the raw ids to indices and ask pred_SVD.\"\"\"\n",
    "    user_idx = self.userIndex[userId]\n",
    "    event_idx = self.eventIndex[eventId]\n",
    "    return self.pred_SVD(user_idx, event_idx)\n",
    "\n",
    "\n",
    "  def userReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    Like user-based collaborative filtering, except that the similarity\n",
    "    between users comes from the users' own attributes (userSimMatrix).\n",
    "    The idea in pseudo-code:\n",
    "    for item i\n",
    "      for every other user v that has a preference for i\n",
    "        compute similarity s between u and v\n",
    "        incorporate v's preference for i weighted by s into running average\n",
    "    return top items ranked by weighted average\n",
    "\n",
    "    Returns (similarity row of the user) x (score column of the event)\n",
    "    minus the user's own score for the event, or 0 on IndexError.\n",
    "    \"\"\"\n",
    "    user_idx = self.userIndex[userId]\n",
    "    event_idx = self.eventIndex[eventId]\n",
    "\n",
    "    event_scores = self.userEventScores[:, event_idx]\n",
    "    user_sims = self.userSimMatrix[user_idx, :]\n",
    "\n",
    "    weighted = user_sims * event_scores\n",
    "\n",
    "    try:\n",
    "      score = weighted[0, 0] - self.userEventScores[user_idx, event_idx]\n",
    "    except IndexError:\n",
    "      score = 0\n",
    "    return score\n",
    "\n",
    "  def eventReco(self, userId, eventId):\n",
    "    \"\"\"\n",
    "    Like item-based collaborative filtering, except that the similarity\n",
    "    between items comes from the items' own attributes.\n",
    "    The idea in pseudo-code:\n",
    "    for item i\n",
    "      for every item j that u has a preference for\n",
    "        compute similarity s between i and j\n",
    "        add u's preference for j weighted by s to a running average\n",
    "    return top items, ranked by weighted average\n",
    "\n",
    "    Returns a pair (pscore, cscore) computed with eventPropSim and\n",
    "    eventContSim respectively; each is (score row of the user) x\n",
    "    (similarity column of the event) minus the user's own score for the\n",
    "    event, and stays 0 if an IndexError is raised.\n",
    "    \"\"\"\n",
    "    user_idx = self.userIndex[userId]\n",
    "    event_idx = self.eventIndex[eventId]\n",
    "    user_scores = self.userEventScores[user_idx, :]\n",
    "    prop_total = user_scores * self.eventPropSim[:, event_idx]\n",
    "    cont_total = user_scores * self.eventContSim[:, event_idx]\n",
    "\n",
    "    try:\n",
    "      pscore = prop_total[0, 0] - self.userEventScores[user_idx, event_idx]\n",
    "    except IndexError:\n",
    "      pscore = 0\n",
    "    try:\n",
    "      cscore = cont_total[0, 0] - self.userEventScores[user_idx, event_idx]\n",
    "    except IndexError:\n",
    "      cscore = 0\n",
    "    return pscore, cscore\n",
    "\n",
    "  def userPop(self, userId):\n",
    "    \"\"\"\n",
    "    Estimate how sociable a user is from the number of friends they have.\n",
    "    The rationale: a user with many friends may be more inclined to take\n",
    "    part in all kinds of social events.\n",
    "\n",
    "    Returns self.numFriends[0, index of userId], or 0 when userId is not in\n",
    "    userIndex or the lookup raises IndexError.\n",
    "    \"\"\"\n",
    "    # `in` replaces the deprecated dict.has_key (removed in Python 3)\n",
    "    if userId not in self.userIndex:\n",
    "      return 0\n",
    "\n",
    "    i = self.userIndex[userId]\n",
    "    try:\n",
    "      return self.numFriends[0, i]\n",
    "    except IndexError:\n",
    "      return 0\n",
    "\n",
    "  def friendInfluence(self, userId):\n",
    "    \"\"\"\n",
    "    Influence of a user's friends on the user.\n",
    "    It reflects how many of the user's friends are keen on taking part in\n",
    "    social activities/events: if the whole circle of friends is active,\n",
    "    that may influence the user as well.\n",
    "\n",
    "    Returns the sum of the user's row of userFriends divided by the number\n",
    "    of columns of userFriends.\n",
    "    \"\"\"\n",
    "    nusers = np.shape(self.userFriends)[1]\n",
    "    i = self.userIndex[userId]\n",
    "    # Sum across the whole row. Summing a single row with axis=0 leaves it\n",
    "    # unchanged, so indexing [0,0] afterwards would only see the first column.\n",
    "    return self.userFriends[i, :].sum() / nusers\n",
    "\n",
    "  def eventPop(self, eventId):\n",
    "    \"\"\"\n",
    "    Popularity of the event itself, read from column 0 of the\n",
    "    eventPopularity matrix at the row of the event's index.\n",
    "    (According to the original note it is derived from the number of\n",
    "    participants.)\n",
    "    \"\"\"\n",
    "    event_row = self.eventIndex[eventId]\n",
    "    return self.eventPopularity[event_row, 0]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def generateRSData(RS, train=True, header=True):\n",
    "    \"\"\"\n",
    "    Combine the user-based and item-based collaborative-filtering scores with\n",
    "    the various popularity and influence measures into one feature row per\n",
    "    record, producing new data for a downstream classifier.\n",
    "\n",
    "    Parameters:\n",
    "      RS:     recommender object providing userCFReco, eventCFReco,\n",
    "              svdCFReco, userReco, eventReco, userPop, friendInfluence\n",
    "              and eventPop.\n",
    "      train:  read \"train.csv\" and append the two label columns when True,\n",
    "              otherwise read \"test.csv\".\n",
    "      header: write a header line with the column names when True.\n",
    "\n",
    "    The result is written to \"RS_train.csv\" / \"RS_test.csv\" in the current\n",
    "    working directory; a progress line is printed every 500 rows.\n",
    "    \"\"\"\n",
    "    fn = \"train.csv\" if train else \"test.csv\"\n",
    "\n",
    "    # `with` closes both files (and flushes the output) even if a row raises\n",
    "    with open(fn, 'rb') as fin, open(\"RS_\" + fn, 'wb') as fout:\n",
    "        # skip the first line (column names)\n",
    "        fin.readline()\n",
    "\n",
    "        # write output header\n",
    "        if header:\n",
    "            ocolnames = [\"invited\", \"userCF_reco\", \"evtCF_reco\",\"svdCF_reco\",\"user_reco\", \"evt_p_reco\",\n",
    "                \"evt_c_reco\", \"user_pop\", \"frnd_infl\", \"evt_pop\"]\n",
    "            if train:\n",
    "                ocolnames.append(\"interested\")\n",
    "                ocolnames.append(\"not_interested\")\n",
    "            fout.write(\",\".join(ocolnames) + \"\\n\")\n",
    "\n",
    "        for ln, line in enumerate(fin, 1):\n",
    "            cols = line.strip().split(\",\")\n",
    "            userId = cols[0]\n",
    "            eventId = cols[1]\n",
    "            invited = cols[2]\n",
    "\n",
    "            # Progress report; printed after parsing so that the ids shown\n",
    "            # belong to row `ln` and not to the previous row.\n",
    "            if ln%500 == 0:\n",
    "                print(\"%s:%d (userId, eventId)=(%s, %s)\" % (fn, ln, userId, eventId))\n",
    "\n",
    "            # collaborative-filtering scores\n",
    "            userCF_reco = RS.userCFReco(userId, eventId)\n",
    "            itemCF_reco = RS.eventCFReco(userId, eventId)\n",
    "            svdCF_reco = RS.svdCFReco(userId, eventId)\n",
    "\n",
    "            # score based on user-attribute similarity\n",
    "            user_reco = RS.userReco(userId, eventId)\n",
    "\n",
    "            # scores based on event-attribute similarity\n",
    "            evt_p_reco, evt_c_reco = RS.eventReco(userId, eventId)\n",
    "\n",
    "            # how sociable the user is, judged by the number of friends\n",
    "            user_pop = RS.userPop(userId)\n",
    "\n",
    "            # influence of the user's friends\n",
    "            frnd_infl = RS.friendInfluence(userId)\n",
    "\n",
    "            # popularity of the event\n",
    "            evt_pop = RS.eventPop(eventId)\n",
    "\n",
    "            # concatenate all scores\n",
    "            ocols = [invited, userCF_reco, itemCF_reco, svdCF_reco,user_reco, evt_p_reco,\n",
    "                evt_c_reco, user_pop, frnd_infl, evt_pop]\n",
    "\n",
    "            if train:\n",
    "                ocols.append(cols[4]) # interested\n",
    "                ocols.append(cols[5]) # not_interested\n",
    "            fout.write(\",\".join(map(str, ocols)) + \"\\n\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SVD Train...\n",
      "SVD trained\n",
      "生成训练数据...\n",
      "\n",
      "train.csv:500 (userId, eventId)=(123290209, 1887085024)\n",
      "train.csv:1000 (userId, eventId)=(272886293, 199858305)\n",
      "train.csv:1500 (userId, eventId)=(395305791, 1582270949)\n",
      "train.csv:2000 (userId, eventId)=(527523423, 3272728211)\n",
      "train.csv:2500 (userId, eventId)=(651258472, 792632006)\n",
      "train.csv:3000 (userId, eventId)=(811791433, 524756826)\n",
      "train.csv:3500 (userId, eventId)=(985547042, 1269035551)\n",
      "train.csv:4000 (userId, eventId)=(1107615001, 173949238)\n",
      "train.csv:4500 (userId, eventId)=(1236336671, 3849306291)\n",
      "train.csv:5000 (userId, eventId)=(1414301782, 2652356640)\n",
      "train.csv:5500 (userId, eventId)=(1595465532, 955398943)\n",
      "train.csv:6000 (userId, eventId)=(1747091728, 2131379889)\n",
      "train.csv:6500 (userId, eventId)=(1914182220, 955398943)\n",
      "train.csv:7000 (userId, eventId)=(2071842684, 1076364848)\n",
      "train.csv:7500 (userId, eventId)=(2217853337, 3051438735)\n",
      "train.csv:8000 (userId, eventId)=(2338481531, 2525447278)\n",
      "train.csv:8500 (userId, eventId)=(2489551967, 520657921)\n",
      "train.csv:9000 (userId, eventId)=(2650493630, 87962584)\n",
      "train.csv:9500 (userId, eventId)=(2791418962, 4223848259)\n",
      "train.csv:10000 (userId, eventId)=(2903662804, 2791462807)\n",
      "train.csv:10500 (userId, eventId)=(3036141956, 3929507420)\n",
      "train.csv:11000 (userId, eventId)=(3176074542, 3459485614)\n",
      "train.csv:11500 (userId, eventId)=(3285425249, 2271782630)\n",
      "train.csv:12000 (userId, eventId)=(3410667855, 1063772489)\n",
      "train.csv:12500 (userId, eventId)=(3531604778, 2584839423)\n",
      "train.csv:13000 (userId, eventId)=(3686871863, 53495098)\n",
      "train.csv:13500 (userId, eventId)=(3833637800, 2415873572)\n",
      "train.csv:14000 (userId, eventId)=(3944021305, 2096772901)\n",
      "train.csv:14500 (userId, eventId)=(4075466480, 3567240505)\n",
      "train.csv:15000 (userId, eventId)=(4197193550, 1628057176)\n",
      "生成预测数据...\n",
      "\n",
      "test.csv:500 (userId, eventId)=(182290053, 2529072432)\n",
      "test.csv:1000 (userId, eventId)=(433510318, 4244463632)\n",
      "test.csv:1500 (userId, eventId)=(632808865, 2845303452)\n",
      "test.csv:2000 (userId, eventId)=(813611885, 2036538169)\n",
      "test.csv:2500 (userId, eventId)=(1010701404, 303459881)\n",
      "test.csv:3000 (userId, eventId)=(1210932037, 2529072432)\n",
      "test.csv:3500 (userId, eventId)=(1452921099, 2705317682)\n",
      "test.csv:4000 (userId, eventId)=(1623287180, 1626678328)\n",
      "test.csv:4500 (userId, eventId)=(1855201342, 2603032829)\n",
      "test.csv:5000 (userId, eventId)=(2083900381, 2529072432)\n",
      "test.csv:5500 (userId, eventId)=(2318415276, 2509151803)\n",
      "test.csv:6000 (userId, eventId)=(2528161539, 4025975316)\n",
      "test.csv:6500 (userId, eventId)=(2749110768, 4244406355)\n",
      "test.csv:7000 (userId, eventId)=(2927772127, 1532377761)\n",
      "test.csv:7500 (userId, eventId)=(3199685636, 1776393554)\n",
      "test.csv:8000 (userId, eventId)=(3393388475, 680270887)\n",
      "test.csv:8500 (userId, eventId)=(3601169721, 154434302)\n",
      "test.csv:9000 (userId, eventId)=(3828963415, 3067222491)\n",
      "test.csv:9500 (userId, eventId)=(4018723397, 2522610844)\n",
      "test.csv:10000 (userId, eventId)=(4180064266, 2658555390)\n"
     ]
    }
   ],
   "source": [
    "# Build the recommender once (this loads every artifact and trains the SVD\n",
    "# model), then write the feature file for the training and the test split.\n",
    "RS = RecommonderSystem()\n",
    "for banner, is_train in ((\"生成训练数据...\\n\", True), (\"生成预测数据...\\n\", False)):\n",
    "    print(banner)\n",
    "    generateRSData(RS, train=is_train, header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "时间、地点等特征目前都还没有处理，可以考虑加入：用户看到 event 的时间与 event 开始时间之差、用户所在地点与 event 地点之间的差异等。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
